#!/usr/bin/env Rscript
"> three-way_per-pos_beta.R <
Reads in betas from EM-seq, WGBS and ONT Cas9 experiments, and compares them
on a per-position basis.
" -> doc
setwd('~/csiro/stopwatch/cpgberus/16_loci_specific_three_way/')
RDNA_FASTA <- '../data/homo_sapiens.45s.fa.gz'
EMSEQ_COV <- Sys.glob('./WR*ER.hsap_45s.cov.gz')
WGBS_COV <- Sys.glob('./WR*WR.hsap_45s.cov.gz')
ONT_BED <- Sys.glob('./WR*.bed.gz')
suppressPackageStartupMessages({
library(cowplot)
library(eulerr)
library(ggplot2)
library(RColorBrewer)
})
# function to pretty-print diagnostic messages
diag_message <- function(...) {
message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}
# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
# modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
model <- lm(as.formula(paste(y, '~', x)), df)
eq <- substitute(
italic(y) == c + m %.% italic(x)*','~italic(r)^2 == r2*','~italic(p) == pval,
list(c = format(unname(coef(model)[1]), digits=3),
m = format(unname(coef(model)[2]), digits=3),
r2 = format(summary(model)$r.squared, digits=4),
pval = format(summary(model)$coefficients[2,4], digits=1)))
as.character(as.expression(eq))
}
# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
sum(int_vector) / length(int_vector) * 100
}
# read data
rdna_seq <- scan(RDNA_FASTA, what='character', skip=1)
# read EM-seq and WGBS data into separate dfs, but logic-wise they're fairly similar
emseq_df <- data.frame()
for (f in EMSEQ_COV) {
sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
sample_id <- gsub('/', '', sample_id, fixed=TRUE)
temp_df <- read.delim(
f, header=FALSE,
col.names=c('scaf', 'pos', 'pos2', 'beta', 'meth', 'unmeth'))
temp_df$sample_id <- sample_id
emseq_df <- rbind(emseq_df, temp_df)
}
wgbs_df <- data.frame()
for (f in WGBS_COV) {
sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
sample_id <- gsub('/', '', sample_id, fixed=TRUE)
temp_df <- read.delim(
f, header=FALSE,
col.names=c('scaf', 'pos', 'pos2', 'beta', 'meth', 'unmeth'))
temp_df$sample_id <- sample_id
wgbs_df <- rbind(wgbs_df, temp_df)
}
# tidy up EM-seq/WGBS df
# first 1 kb is promoter sequence; make sure transcription starts at +1
emseq_df$pos <- emseq_df$pos - 1000
wgbs_df$pos <- wgbs_df$pos - 1000
emseq_df$cov <- emseq_df$meth + emseq_df$unmeth
wgbs_df$cov <- wgbs_df$meth + wgbs_df$unmeth
# ONT BEDs are "bedMethyl" files
# https://www.encodeproject.org/data-standards/wgbs/
ont_df <- data.frame()
for (f in ONT_BED) {
sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
sample_id <- gsub('/', '', sample_id, fixed=TRUE)
temp_df <- read.delim(
f, header=FALSE,
col.names=c('scaf', 'pos0', 'pos', 'name', 'score', 'strand',
'startcodon', 'stopcodon', 'color_rgb', 'cov', 'beta'))
temp_df$sample_id <- sample_id
ont_df <- rbind(ont_df, temp_df)
}
# tidy up ONT df
# first 11 kb is promoter sequence; make sure transcription starts at +1
ont_df$pos <- ont_df$pos - 11000
# set comparative window to [1, 13332], the transcribed 45S region. both df
# should have data in this range
emseq_df <- emseq_df[emseq_df$pos >= 1 & emseq_df$pos <= 13332, ]
wgbs_df <- wgbs_df[wgbs_df$pos >= 1 & wgbs_df$pos <= 13332, ]
ont_df <- ont_df[ont_df$pos >= 1 & ont_df$pos <= 13332, ]
# set a coverage threshold of 50 to increase confidence in beta values
diag_message('Filter `emseq_df` for positions with coverage >= 50. ',
'Original nrow: ', nrow(emseq_df), '; ',
'filtered nrow: ', nrow(emseq_df[emseq_df$cov >= 50, ]))
## [16:21:15] Filter `emseq_df` for positions with coverage >= 50. Original nrow: 15294; filtered nrow: 14641
emseq_df <- emseq_df[emseq_df$cov >= 50, ]
diag_message('Filter `wgbs_df` for positions with coverage >= 50. ',
'Original nrow: ', nrow(wgbs_df), '; ',
'filtered nrow: ', nrow(wgbs_df[wgbs_df$cov >= 50, ]))
## [16:21:15] Filter `wgbs_df` for positions with coverage >= 50. Original nrow: 15057; filtered nrow: 13672
wgbs_df <- wgbs_df[wgbs_df$cov >= 50, ]
diag_message('Filter `ont_df` for positions with coverage >= 50. ',
'Original nrow: ', nrow(ont_df), '; ',
'filtered nrow: ', nrow(ont_df[ont_df$cov >= 50, ]))
## [16:21:15] Filter `ont_df` for positions with coverage >= 50. Original nrow: 14928; filtered nrow: 14928
ont_df <- ont_df[ont_df$cov >= 50, ]
# calculate mean coverage for each method at this point
diag_message('Mean coverage of remaining positions are: ',
'EM-seq ', mean(emseq_df$cov), '; ',
'WGBS ', mean(wgbs_df$cov), '; ',
'ONT Cas9 ', mean(ont_df$cov))
## [16:21:15] Mean coverage of remaining positions are: EM-seq 946.86182637798; WGBS 526.671006436513; ONT Cas9 608.189643622722
# only keep relevant columns
emseq_df <- emseq_df[, c('pos', 'beta', 'sample_id')]
wgbs_df <- wgbs_df[, c('pos', 'beta', 'sample_id')]
ont_df <- ont_df[, c('pos', 'beta', 'sample_id')]
# convert long-to-wide
emseq_wide_df <- reshape(emseq_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(emseq_wide_df) <- gsub('^beta.', '', colnames(emseq_wide_df))
emseq_wide_df <- emseq_wide_df[order(emseq_wide_df$pos), ]
wgbs_wide_df <- reshape(wgbs_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(wgbs_wide_df) <- gsub('^beta.', '', colnames(wgbs_wide_df))
wgbs_wide_df <- wgbs_wide_df[order(wgbs_wide_df$pos), ]
ont_wide_df <- reshape(ont_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(ont_wide_df) <- gsub('^beta.', '', colnames(ont_wide_df))
ont_wide_df <- ont_wide_df[order(ont_wide_df$pos), ]
# check how many rows have NA beta values in ANY of the four datasets (driven
# by insufficient coverage)
diag_message('Number of incomplete rows in `emseq_wide_df`: ',
sum(!complete.cases(emseq_wide_df)))
## [16:21:15] Number of incomplete rows in `emseq_wide_df`: 65
diag_message('Number of incomplete rows in `wgbs_wide_df`: ',
sum(!complete.cases(wgbs_wide_df)))
## [16:21:15] Number of incomplete rows in `wgbs_wide_df`: 109
diag_message('Number of incomplete rows in `ont_wide_df`: ',
sum(!complete.cases(ont_wide_df)))
## [16:21:15] Number of incomplete rows in `ont_wide_df`: 0
# hmm, not a lot. drop positions where methylation levels were NA in ANY of the
# four datasets
emseq_wide_df <- emseq_wide_df[complete.cases(emseq_wide_df), ]
wgbs_wide_df <- wgbs_wide_df[complete.cases(wgbs_wide_df), ]
ont_wide_df <- ont_wide_df[complete.cases(ont_wide_df), ]
# do WGBS datasets have fewer positions with meth data than EM-seq/ONT? check
# overlap of positions with methylation data across three different techniques
pos_list <- list(`EM-seq`=emseq_wide_df$pos,
WGBS=wgbs_wide_df$pos,
`ONT Cas9`=ont_wide_df$pos)
plot(venn(pos_list))

# ... yes.
# combine the wide dfs for more diagnostic plots
wide_df <- merge(emseq_wide_df, wgbs_wide_df, by='pos', all=TRUE,
suffixes=c('_emseq', '_wgbs'))
wide_df <- merge(wide_df, ont_wide_df, by='pos', all=TRUE)
# select for positions that are present in all 12 datasets
wide_df <- wide_df[complete.cases(wide_df), ]
rownames(wide_df) <- wide_df$pos
wide_df <- wide_df[, -1]
# calculate per-position MEAN methylation levels for every method separately
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))])
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))])
wide_df$meanO <- rowMeans(wide_df[, grepl('O$', colnames(wide_df))])
head(wide_df)
## WR025V1ER WR025V9ER WR069V1ER WR069V9ER WR025V1WR WR025V9WR WR069V1WR
## 8 18.44603 17.27219 17.25632 17.11510 21.24736 18.87006 15.983607
## 9 16.88144 17.75188 16.67913 15.64516 15.53589 14.63940 13.067151
## 21 13.06954 12.84349 13.37719 12.09801 13.96825 12.98701 12.788260
## 22 13.12010 13.19832 11.97877 11.54472 11.60275 10.79295 8.780037
## 29 20.63397 20.46679 20.59041 20.81727 23.50381 20.00000 18.474758
## 30 20.64725 19.05099 20.63253 19.67742 18.20896 17.05171 16.558140
## WR069V9WR WR025V1O WR025V9O WR069V1O WR069V9O meanER meanWR meanO
## 8 18.95332 17.6 17.4 16.2 15.9 17.52241 18.76359 16.775
## 9 12.62799 16.3 21.1 19.7 12.4 16.73941 13.96761 17.375
## 21 12.91727 13.1 15.2 13.5 14.7 12.84706 13.16520 14.125
## 22 11.77156 18.3 19.2 17.5 13.3 12.46048 10.73683 17.075
## 29 23.66864 22.8 22.8 23.6 22.4 20.62711 21.41180 22.900
## 30 18.20303 28.5 38.3 29.2 22.9 20.00205 17.50546 29.725
# sanity check: should match the intersection of the three circles in venn diagram
nrow(wide_df) # expected: 3336
## [1] 3336
# plot a PCA to visualise overall profile of datasets
set.seed(1337)
pca <- prcomp(t(wide_df), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2
pca_df <- data.frame(PC1=pca_coords$PC1,
PC2=pca_coords$PC2,
row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*O$', 'ONT Cas9', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|O$', '', rownames(pca_df))
g <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
geom_point(size=3, alpha=0.5) +
scale_shape_manual(values=21:25) +
xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
theme_minimal(12) +
theme(legend.position='top', legend.box='vertical', legend.margin=margin())
print(g)

ggsave('three-way.pca.pdf', width=6, height=6)
# PCA indicates variation is largest across methods, much less variation across
# samples. which is why per-method means were computed. subsequent plots are
# based off these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position methylation levels, EM-seq vs. WGBS') +
xlab('EM-seq') +
ylab('WGBS') +
theme_minimal(12)

# plot EM-seq vs. ONT
ggplot(wide_df, aes(x=meanER, y=meanO)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanO'), parse=TRUE) +
ggtitle('Per-position methylation levels, EM-seq vs. ONT Cas9') +
xlab('EM-seq') +
ylab('ONT Cas9') +
theme_minimal(12)

# WGBS vs. ONT
ggplot(wide_df, aes(x=meanWR, y=meanO)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanO'), parse=TRUE) +
ggtitle('Per-position methylation levels, WGBS vs. ONT Cas9') +
xlab('WGBS') +
ylab('ONT Cas9') +
theme_minimal(12)

# create a long df for plotting
long_df <- wide_df[, c('meanER', 'meanWR', 'meanO')]
long_df$pos <- as.numeric(rownames(long_df))
long_df <- reshape(long_df, direction='long',
idvar='pos',
varying=c('meanER', 'meanWR', 'meanO'), v.name='beta',
timevar='method', times=c('EM-seq', 'WGBS', 'ONT Cas9'))
# sort by pos, reset row numbering
long_df <- long_df[order(long_df$pos), ]
rownames(long_df) <- NULL
head(long_df)
## pos method beta
## 1 8 EM-seq 17.52241
## 2 8 WGBS 18.76359
## 3 8 ONT Cas9 16.77500
## 4 9 EM-seq 16.73941
## 5 9 WGBS 13.96761
## 6 9 ONT Cas9 17.37500
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101 # this value should be odd, so that the base at midpoint
# is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2
# create a string where C/G == 1 and A/T == 0, then
# GC% = sum(window) / len(window)
rdna_binary <- gsub('C', '1', rdna_seq, ignore.case=TRUE)
rdna_binary <- gsub('G', '1', rdna_binary, ignore.case=TRUE)
rdna_binary <- gsub('[^1]', '0', rdna_binary, ignore.case=TRUE)
gcpct_df <- data.frame(pos=(bp_per_side + 1):(nchar(rdna_seq) - bp_per_side),
gcpct=0)
gcpct_df$gcpct <- unlist(lapply(
gcpct_df$pos,
function(x) pct_of_ones(substr(rdna_binary, x - bp_per_side, x + bp_per_side))))
# first 1 kb is promoter sequence; make sure transcription starts at +1
gcpct_df$pos <- gcpct_df$pos - 1000
# plot beta across loci
g1 <- ggplot(long_df, aes(x=pos, y=beta, color=method)) +
geom_point(alpha=0.1) +
geom_smooth(method='loess', span=0.1) +
scale_color_manual(values=c('#1b9e77','#d95f02','#7570b3')) +
coord_cartesian(xlim=c(0, 13332)) +
ggtitle('Methylation levels across 45S loci') +
xlab('Position') +
ylab('Methylation level (%)') +
theme_minimal(12) +
theme(legend.position='top',
axis.title.x=element_blank(),
axis.text.x=element_blank())
g2 <- ggplot(gcpct_df, aes(x=pos, y=gcpct)) +
geom_smooth(method='loess', span=0.1) +
coord_cartesian(xlim=c(0, 13332)) +
xlab('Position') +
ylab('GC%') +
theme_minimal(12)
plot_grid(g1, g2, ncol=1, rel_heights=c(0.8, 0.2))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
#
# merge dataframes together (inner-join), reset row names
wide_df <- merge(wide_df, gcpct_df, by.x=0, by.y='pos', all=FALSE)
colnames(wide_df)[1] <- 'pos'
wide_df$pos <- as.numeric(wide_df$pos)
wide_df <- wide_df[order(wide_df$pos), ]
rownames(wide_df) <- NULL
head(wide_df)
## pos WR025V1ER WR025V9ER WR069V1ER WR069V9ER WR025V1WR WR025V9WR WR069V1WR
## 1 8 18.44603 17.27219 17.25632 17.11510 21.24736 18.87006 15.983607
## 2 9 16.88144 17.75188 16.67913 15.64516 15.53589 14.63940 13.067151
## 3 21 13.06954 12.84349 13.37719 12.09801 13.96825 12.98701 12.788260
## 4 22 13.12010 13.19832 11.97877 11.54472 11.60275 10.79295 8.780037
## 5 29 20.63397 20.46679 20.59041 20.81727 23.50381 20.00000 18.474758
## 6 30 20.64725 19.05099 20.63253 19.67742 18.20896 17.05171 16.558140
## WR069V9WR WR025V1O WR025V9O WR069V1O WR069V9O meanER meanWR meanO
## 1 18.95332 17.6 17.4 16.2 15.9 17.52241 18.76359 16.775
## 2 12.62799 16.3 21.1 19.7 12.4 16.73941 13.96761 17.375
## 3 12.91727 13.1 15.2 13.5 14.7 12.84706 13.16520 14.125
## 4 11.77156 18.3 19.2 17.5 13.3 12.46048 10.73683 17.075
## 5 23.66864 22.8 22.8 23.6 22.4 20.62711 21.41180 22.900
## 6 18.20303 28.5 38.3 29.2 22.9 20.00205 17.50546 29.725
## gcpct
## 1 62.37624
## 2 63.36634
## 3 68.31683
## 4 67.32673
## 5 67.32673
## 6 66.33663
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
geom_point(alpha=0.4) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position methylation levels') +
xlab('EM-seq (%)') +
ylab('WGBS (%)') +
theme_minimal(12) +
theme(legend.position=c(0.75, 0.1))
# plot EM-seq vs. ONT Cas9 and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanO, color=gcpct)) +
geom_point(alpha=0.4) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanO'), parse=TRUE) +
ggtitle('') +
xlab('EM-seq (%)') +
ylab('ONT Cas9 (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# plot WGBS vs. ONT Cas9 and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanO, color=gcpct)) +
geom_point(alpha=0.4) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanO'), parse=TRUE) +
ggtitle('') +
xlab('WGBS (%)') +
ylab('ONT Cas9 (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
##
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.6962 -2.8369 -0.4385 2.4450 20.8407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.588541 0.608575 -25.61 <2e-16 ***
## wide_df$gcpct 0.225857 0.007925 28.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.959 on 3327 degrees of freedom
## Multiple R-squared: 0.1962, Adjusted R-squared: 0.196
## F-statistic: 812.2 on 1 and 3327 DF, p-value: < 2.2e-16
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
ggtitle('Per-position residual methylation levels vs. GC%') +
xlab('GC%') +
ylab('Residual WGBS - EM-seq (%)') +
theme_minimal(12)
wide_df$delta_ont_emseq <- wide_df$meanO - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual ONT Cas9 - EM-seq (%)') +
theme_minimal(12)
wide_df$delta_ont_wgbs <- wide_df$meanO - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
geom_point(alpha=0.2) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual ONT Cas9 - WGBS (%)') +
theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
labels=c('A', 'B', 'C', 'D', 'E', 'F'))

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)
# list deps used in this script
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 ggplot2_3.3.5 eulerr_6.1.1 cowplot_1.1.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8 bslib_0.3.1 compiler_4.1.2 pillar_1.6.4
## [5] jquerylib_0.1.4 highr_0.9 tools_4.1.2 digest_0.6.29
## [9] lattice_0.20-45 nlme_3.1-155 jsonlite_1.7.3 evaluate_0.14
## [13] lifecycle_1.0.1 tibble_3.1.6 gtable_0.3.0 mgcv_1.8-38
## [17] pkgconfig_2.0.3 rlang_0.4.12 Matrix_1.4-0 DBI_1.1.2
## [21] yaml_2.2.1 xfun_0.29 fastmap_1.1.0 withr_2.4.3
## [25] stringr_1.4.0 dplyr_1.0.7 knitr_1.37 generics_0.1.1
## [29] sass_0.4.0 vctrs_0.3.8 tidyselect_1.1.1 grid_4.1.2
## [33] glue_1.6.0 R6_2.5.1 fansi_1.0.2 rmarkdown_2.11
## [37] polyclip_1.10-0 farver_2.1.0 polylabelr_0.2.0 purrr_0.3.4
## [41] magrittr_2.0.1 splines_4.1.2 scales_1.1.1 htmltools_0.5.2
## [45] ellipsis_0.3.2 assertthat_0.2.1 colorspace_2.0-2 labeling_0.4.2
## [49] utf8_1.2.2 stringi_1.7.6 munsell_0.5.0 crayon_1.4.2